In [19]:
import os
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from statsmodels.discrete.discrete_model import Logit, LogitResults
In [5]:
# Widen pandas' display limits for this notebook and render floats with two
# decimals. The fully qualified `display.*` keys are required: the bare names
# 'max_columns' / 'max_rows' also match the `styler.render.*` options in newer
# pandas releases, which makes set_option raise OptionError (ambiguous key).
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)
In [7]:
# Load the pickled claims data from the Chapter 7 directory. The path is
# relative, so it resolves against the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../Chapter 7 - Data Preparation and Visualization/claims_df')
In [32]:
# Show the dimensions of the loaded data as a sanity check.
df.shape
Out[32]:
In [9]:
# Names of the disease indicator columns (all carry the SP_ prefix).
disease = [
    'SP_ALZHDMTA',
    'SP_CHF',
    'SP_CHRNKIDN',
    'SP_CNCR',
    'SP_COPD',
    'SP_DEPRESSN',
    'SP_DIABETES',
    'SP_ISCHMCHT',
    'SP_OSTEOPRS',
    'SP_RA_OA',
    'SP_STRKETIA',
]

# Single-column groups are kept as lists so they can be concatenated with
# `disease` when the feature matrix is selected.
gender = [
    'gender_2',
]
ESRD = [
    'ESRD_Y',
]
In [10]:
# Feature matrix: disease columns, then the gender column, then the ESRD
# column, in that order.
X = df[[*disease, *gender, *ESRD]]
# Target: the HIGH_COST column of the same frame.
y = df['HIGH_COST']
In [11]:
# Encoder used in the next cell to inspect the label encoding of `y`.
lab_enc = LabelEncoder()
In [12]:
# Fit the encoder on the target and display the encoded labels. The return
# value is not assigned, so `y` itself is left unchanged for the model cells
# that follow.
lab_enc.fit_transform(y)
Out[12]:
In [110]:
# scikit-learn logistic regression with all default settings.
lm = LogisticRegression()
In [111]:
# Fit on the full feature matrix and target (no train/test split is made here).
lm.fit(X, y)
Out[111]:
In [112]:
# Mean accuracy on the same data the model was fit on - an in-sample figure,
# not a held-out estimate.
lm.score(X, y)
Out[112]:
In [113]:
def generate_conf_mat(model, X, y):
    """Return the confusion matrix of `model` on (X, y) as proportions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : feature matrix handed to ``model.predict``.
    y : true labels aligned with the rows of ``X``.

    Returns
    -------
    pandas.DataFrame
        Rows are the actual classes ('0_actual', '1_actual') and columns are
        the predicted classes ('0_pred', '1_pred'). Each cell holds the share
        of all samples that fall in it, so the entries sum to 1.

    Raises
    ------
    ValueError
        Raised by pandas when the confusion matrix is not 2x2, because the
        two fixed row/column labels then do not fit.
    """
    # confusion_matrix takes (y_true, y_pred): rows are the actual classes and
    # columns the predicted ones. Passing the predictions first would
    # transpose the matrix relative to the labels applied below.
    counts = confusion_matrix(y, model.predict(X))
    # Divide by the grand total so cells are proportions of all samples.
    conf_mat = pd.DataFrame(
        counts / counts.sum(),
        index=['0_actual', '1_actual'],
        columns=['0_pred', '1_pred'],
    )
    return conf_mat
In [114]:
# Normalised confusion matrix for the logistic regression, computed on the
# same X and y it was fit on.
conf_mat = generate_conf_mat(lm, X, y)
In [115]:
# Display the logistic-regression confusion matrix.
conf_mat
Out[115]:
In [116]:
# Build a statsmodels Logit model on the same target and predictors
# (the target is passed first, then the features).
# NOTE(review): X is passed as-is, with no constant column added in this
# notebook. statsmodels does not add an intercept on its own, so this model
# presumably differs from the scikit-learn fit above - confirm this is intended.
lm2 = Logit(y, X)
In [117]:
# Estimate the Logit model; the returned results object is kept for the
# summary in the next cell.
results = lm2.fit()
In [118]:
# Display the statsmodels summary of the fitted Logit model.
results.summary()
Out[118]:
In [119]:
from sklearn.svm import LinearSVC
In [120]:
# Linear SVM with class_weight='balanced'. The class is referenced by its bare
# name because the notebook imports `LinearSVC` directly from sklearn.svm;
# no `svm` module name is ever bound, so `svm.LinearSVC` raises NameError.
mod = LinearSVC(class_weight='balanced')
In [121]:
# Fit the linear SVM on the same features and target as the logistic models.
mod.fit(X, y)
Out[121]:
In [122]:
# Normalised confusion matrix for the SVM on the same X and y. This rebinds
# `conf_mat`, replacing the logistic-regression matrix computed earlier.
conf_mat = generate_conf_mat(mod, X, y)
In [123]:
# Display the SVM confusion matrix.
conf_mat
Out[123]:
In [ ]:
In [ ]: